HW 01

Author

Yashi Mi

0 - Setup

if (!require("pacman")) 
  install.packages("pacman")

# use this line for installing/loading
pacman::p_load (tidyverse,
              palmerpenguins )

devtools::install_github("tidyverse/dsbox")

library(here)

1 - Road traffic accidents in Edinburgh

accidents <- read_csv(here("data","accidents.csv"))
glimpse(accidents)
Rows: 768
Columns: 31
$ id                 <chr> "2018950000002", "2018950000006", "2018950000012", …
$ easting            <dbl> 327174, 324874, 330500, 321890, 320120, 331752, 325…
$ northing           <dbl> 670941, 672457, 671750, 671640, 669330, 667988, 674…
$ longitude          <dbl> -3.167032, -3.204252, -3.114026, -3.251772, -3.2794…
$ latitude           <dbl> 55.92600, 55.93926, 55.93376, 55.93145, 55.91041, 5…
$ police_force       <dbl> 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,…
$ severity           <chr> "Slight", "Slight", "Slight", "Slight", "Slight", "…
$ vehicles           <dbl> 1, 1, 2, 3, 2, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 2, …
$ casualties         <dbl> 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, …
$ date               <chr> "31/12/2018", "30/12/2018", "03/01/2018", "01/01/20…
$ day_of_week        <chr> "Monday", "Sunday", "Wednesday", "Monday", "Thursda…
$ time               <time> 14:59:00, 12:50:00, 14:34:00, 02:25:00, 09:00:00, …
$ district           <dbl> 923, 923, 923, 923, 923, 923, 923, 923, 923, 923, 9…
$ highway            <chr> "S12000036", "S12000036", "S12000036", "S12000036",…
$ first_road_class   <chr> "Unclassified", "Unclassified", "A(M) road", "A(M) …
$ first_road_number  <dbl> 0, 0, 6095, 71, 0, 720, 0, 0, 1, 700, 0, 0, 0, 0, 0…
$ road_type          <chr> "Single carriageway", "Single carriageway", "Single…
$ speed_limit        <dbl> 20, 20, 20, 30, 30, 70, 20, 30, 20, 20, 20, 20, 20,…
$ junction_detail    <chr> "Other junction", "Other junction", "Crossroads", "…
$ junction_control   <chr> "Give way or uncontrolled", "Give way or uncontroll…
$ second_road_class  <chr> "Unclassified", "Missing / Out of range", "A-road",…
$ second_road_number <dbl> 0, -1, 6106, 0, 0, 720, 0, 0, 0, 700, 0, 0, 0, 0, 0…
$ ped_cross_human    <chr> "None within 50 metres", "None within 50 metres", "…
$ ped_cross_physical <chr> "Pedestrian phase at traffic signal junction", "No …
$ light              <chr> "Daylight", "Daylight", "Daylight", "Darkness - lig…
$ weather            <chr> "Fine + no high winds", "Fine + no high winds", "Fi…
$ road_surface       <chr> "Dry", "Dry", "Wet or damp", "Wet or damp", "Wet or…
$ special_condition  <chr> "None", "None", "None", "None", "None", "None", "No…
$ hazard             <chr> "None", "None", "None", "None", "None", "None", "No…
$ urban_rural        <dbl> 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ police             <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Y…
accidents <- accidents |>
  mutate(day_type = case_when(
    day_of_week %in% c("Saturday", "Sunday") ~ "Weekend",
    day_of_week %in% c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday") ~ "Weekday",
    TRUE ~ as.character(day_of_week)))
ggplot(accidents, aes(x = time, fill = severity)) +
  geom_density(alpha=0.6) +
  facet_wrap(~day_type, ncol=1) +
  scale_fill_manual("Severity", values = c("#8E6C8A", "#5E9CA0", "#F4E04D")) +
  labs(title = "Number of accidents throughout the day",
    subtitle = "By day of week and severity",
    x = "Time of day",
    y = "Density") +
  theme_minimal(base_size = 14)

Note

The figure illustrates the distribution of road traffic accidents in Edinburgh across different times of day, separated by weekday and weekend, and categorized by severity (Fatal, Serious, Slight).

On weekdays, the number of serious and slight accidents peaks between 16:00 and 18:00, likely corresponding to evening rush hours. Fatal accidents show a smaller peak around midday (12:00).

On weekends, the number of slight accidents also peaks around 16:00, but serious accidents tend to peak slightly later, between 18:00 and 20:00. Notably, there are no fatal accidents reported on weekends in this dataset.

2 - NYC marathon winners

nyc_marathon <- read_csv(here("data","nyc_marathon.csv"))
glimpse(nyc_marathon)
Rows: 102
Columns: 7
$ year     <dbl> 1970, 1970, 1971, 1971, 1972, 1972, 1973, 1973, 1974, 1974, 1…
$ name     <chr> "Gary Muhrcke", NA, "Norman Higgins", "Beth Bonner", "Sheldon…
$ country  <chr> "United States", NA, "United States", "United States", "Unite…
$ time     <time> 02:31:38,       NA, 02:22:54, 02:55:22, 02:27:52, 03:08:41, …
$ time_hrs <dbl> 2.527222, NA, 2.381667, 2.922778, 2.464444, 3.144722, 2.36500…
$ division <chr> "Men", "Women", "Men", "Women", "Men", "Women", "Men", "Women…
$ note     <chr> "Course record", "No woman finishers", "Course record", "Worl…
nyc_marathon_clean <- nyc_marathon |>
  filter(!is.na(time_hrs))

ggplot(nyc_marathon_clean, aes(x = time_hrs))+
  geom_histogram(binwidth = 0.0625)+
   labs(x = "Finishing Time (hours)",
    y = "Count" )+
    theme_minimal(base_size = 14)

ggplot(nyc_marathon_clean, aes(y = time_hrs))+
  geom_boxplot()+
   labs(y = "Finishing Time (hours)" )+
    theme_minimal(base_size = 14)

Note

The histogram shows the overall distribution of finishing times for all marathon runners, making it easier to observe the frequency of different time intervals. In contrast, the box plot summarizes key statistics such as the median, interquartile range, and potential outliers.

ggplot(nyc_marathon_clean, aes(x = division,y = time_hrs, fill = division))+
  geom_boxplot()+
  scale_fill_manual(values = c("Blue","Red"))+
   labs(x = "Gender",
     y = "Finishing Time (hours)" )+
    theme_minimal(base_size = 14)

Note

This figure compares the distribution of marathon finishing times between male and female runners. It clearly shows that female runners tend to have longer finishing times than male runners.

ggplot(nyc_marathon_clean, aes(x = division,y = time_hrs, fill = division))+
  geom_boxplot(show.legend = FALSE)+
  scale_fill_manual(values = c("Blue","Red"))+
   labs(x = "Gender",
     y = "Finishing Time (hours)" )+
    theme_minimal(base_size = 14)

Note

Since the gender information is already clearly labeled on the x-axis, the color legend becomes redundant. Removing the legend increases the data-to-ink ratio by reducing unnecessary visual elements.

ggplot(nyc_marathon_clean, aes(x = year,y = time_hrs, color   = division,shap = division))+
  geom_point(stat = "summary", fun = "mean")+
  scale_color_manual(values = c("Blue","Red")) +
  scale_shape_manual(values = c("5","7")) +
   labs(x = "year",
    y = "Finishing Time (hours)",
    color = "Gender",
    shape = "Gender")+
    theme_minimal(base_size = 14)

Note

The plot shows how marathon finishing times have changed over the past decades. It reveals a notable downward trend in finishing times for both men and women. Additionally, men consistently finished faster than women throughout the recorded years.

3 - US counties

Note

“geom_point(aes(x = median_edu, y = median_hh_income))”

This line creates a scatter plot to show the relationship between median years of education and median household income across U.S. counties.

“geom_boxplot(aes(x = smoking_ban, y = pop2017))”

This line creates a boxplot showing the distribution of 2017 population sizes grouped by smoking ban status.

However, combining these two plots doesn’t make sense because they use different x-axes: one is a continuous variable (median_edu) and the other is a categorical variable (smoking_ban).

Note

The second plot makes it much easier to compare poverty levels across people from different median education levels. This is because all four groups share the same y-axis scale (poverty), which facilitates direct visual comparison.

This suggests that when your goal is to compare values on the y-axis, you should place the faceting variable across columns to keep the y-axis consistent. Conversely, if you are focusing on comparisons along the x-axis, it’s better to facet across rows.

library(openintro)
glimpse(county)  
Rows: 3,142
Columns: 15
$ name              <chr> "Autauga County", "Baldwin County", "Barbour County"…
$ state             <fct> Alabama, Alabama, Alabama, Alabama, Alabama, Alabama…
$ pop2000           <dbl> 43671, 140415, 29038, 20826, 51024, 11714, 21399, 11…
$ pop2010           <dbl> 54571, 182265, 27457, 22915, 57322, 10914, 20947, 11…
$ pop2017           <int> 55504, 212628, 25270, 22668, 58013, 10309, 19825, 11…
$ pop_change        <dbl> 1.48, 9.19, -6.22, 0.73, 0.68, -2.28, -2.69, -1.51, …
$ poverty           <dbl> 13.7, 11.8, 27.2, 15.2, 15.6, 28.5, 24.4, 18.6, 18.8…
$ homeownership     <dbl> 77.5, 76.7, 68.0, 82.9, 82.0, 76.9, 69.0, 70.7, 71.4…
$ multi_unit        <dbl> 7.2, 22.6, 11.1, 6.6, 3.7, 9.9, 13.7, 14.3, 8.7, 4.3…
$ unemployment_rate <dbl> 3.86, 3.99, 5.90, 4.39, 4.02, 4.93, 5.49, 4.93, 4.08…
$ metro             <fct> yes, yes, no, yes, yes, no, no, yes, no, no, yes, no…
$ median_edu        <fct> some_college, some_college, hs_diploma, hs_diploma, …
$ per_capita_income <dbl> 27841.70, 27779.85, 17891.73, 20572.05, 21367.39, 15…
$ median_hh_income  <int> 55317, 52562, 33368, 43404, 47412, 29655, 36326, 436…
$ smoking_ban       <fct> none, none, partial, none, none, none, NA, NA, none,…
county_clean <- county |>
  filter(!is.na(homeownership),!is.na(poverty))

ggplot(county_clean, aes(x = homeownership, y = poverty))+
  geom_point()+
  labs(title = "Plot A") 

ggplot(county_clean, aes(x = homeownership, y = poverty))+
  geom_point()+
  geom_smooth(method = "gam", se = FALSE, color = "Blue")+
  labs(title = "Plot B") 

ggplot(county_clean, aes(x = homeownership, y = poverty, group = metro))+
  geom_point()+
  geom_smooth(method = "gam",se = FALSE, color = "Green")+
  labs(title = "Plot C") 

ggplot(county_clean, aes(x = homeownership, y = poverty, group = metro))+
  geom_smooth(method = "gam", se = FALSE, color = "Blue")+
  geom_point()+
  labs(title = "Plot D") 

ggplot(county_clean, aes(x = homeownership, y = poverty, group = metro))+
  geom_point(aes(color = metro))+
  geom_smooth(method = "gam", se = FALSE, color = "Blue", aes(linetype = metro))+
  guides(color = guide_legend(order = 2),    
    linetype = guide_legend(order = 1)) +
  labs(title = "Plot E") 

ggplot(county_clean, aes(x = homeownership, y = poverty, group = metro, color =   metro))+
  geom_point()+
  geom_smooth(method = "gam", se = FALSE, )+
  labs(title = "Plot F") 

ggplot(county_clean, aes(x = homeownership, y = poverty))+
  geom_point(aes(group = metro, color = metro))+
  geom_smooth(method = "gam",  se = FALSE, color = "Blue")+
  labs(title = "Plot G") 

ggplot(county_clean, aes(x = homeownership, y = poverty))+
  geom_point(aes(group = metro, color = metro))+
  labs(title = "Plot H") 

4 - Rental apartments in SF

credit <- read_csv(here("data","credit.csv"))
glimpse(credit)
Rows: 400
Columns: 5
$ balance <dbl> 333, 903, 580, 964, 331, 1151, 203, 872, 279, 1350, 1407, 0, 2…
$ income  <dbl> 14.891, 106.025, 104.593, 148.924, 55.882, 80.180, 20.996, 71.…
$ student <chr> "No", "Yes", "No", "No", "No", "No", "No", "No", "No", "Yes", …
$ married <chr> "Yes", "Yes", "No", "No", "Yes", "No", "No", "No", "No", "Yes"…
$ limit   <dbl> 3606, 6645, 7075, 9504, 4897, 8047, 3388, 7114, 3300, 6819, 81…
library(scales)

ggplot(credit, aes(x = income,y = balance, color = student, shape = student ))+
  geom_point(alpha = 0.6,show.legend = FALSE)+
  geom_smooth(method = "lm", se = FALSE, show.legend = FALSE)+
  labs(x = "Income", y = "Credit card balance")+
  facet_grid(student ~ married, labeller = label_both)+
  scale_x_continuous(labels = dollar_format(prefix = "$", suffix = "K"))+
  scale_y_continuous(labels = dollar_format(prefix = "$"))+
  theme_minimal(base_size = 14) +
  theme( strip.background = element_rect(fill = "grey90"),
         panel.border = element_rect(color = "Black", fill = NA))

Note

This figure illustrates the relationship between income and credit card balance, segmented by student status and marital status. Overall, there is a positive correlation between income and credit card balance. Compared to non-students, students tend to carry higher credit card balances at similar income levels. Among students, married individuals generally have lower balances than their unmarried peers with comparable incomes.

Note

I believe that marital status and student status are useful predictors, in addition to income, for predicting credit card balance. Students tend to have higher credit card balances than non-students at similar income levels. Additionally, married individuals generally have lower balances than unmarried individuals, which suggests a potential interaction effect between marital status and student status.

library(scales)

credit <- credit|>
  mutate(credit_utilization = balance/limit)
    
ggplot(credit, aes(x = income,y = credit_utilization, color = student, shape = student ))+
  geom_point(alpha = 0.6,show.legend = FALSE)+
  geom_smooth(method = "lm", se = FALSE, show.legend = FALSE)+
  labs(x = "Income", y = "Credit utilization")+
  facet_grid(student ~ married, labeller = label_both)+
  scale_x_continuous(labels = dollar_format(prefix = "$", suffix = "K"))+
  scale_y_continuous(labels = label_percent(suffix = "%"))+
  theme_minimal(base_size = 14) +
  theme( strip.background = element_rect(fill = "grey90"),
         panel.border = element_rect(color = "Black", fill = NA))

Note

The relationships between income and credit utilization differ from those between income and credit card balance. Among non-students, income and credit utilization show a positive correlation, consistent with the pattern observed for credit card balance. However, for student groups, the correlation between income and credit utilization appears to follow an opposite trend compared to the relationship with credit card balance. Additionally, married individuals generally have lower credit utilization than unmarried individuals at similar income levels.

5 - Napoleon’s march.